This notebook compares doublet results calculated on benchmarking datasets to one another, with the primary goal of addressing these questions:
There are several items to bear in mind when interpreting these results:
cxds, as we are using it, does not have a specific
threshold for calling droplets. By contrast, both scrublet
and scDblFinder identify a threshold based on the given
dataset they are processing. This notebook used a doublet threshold of
>=0.5 for cxds, which may not be
universally suitable (though choosing a universally suitable threshold
is not easy either!).suppressPackageStartupMessages({
# Attach the packages used throughout this notebook, silencing their
# startup messages to keep the rendered output clean.
library(SingleCellExperiment)
library(ggplot2)
library(patchwork)
library(caret)
library(UpSetR)
})
# Score threshold at or above which a droplet is called a cxds doublet
cxds_threshold <- 0.5

# Use the black-and-white ggplot2 theme for every plot in this notebook
theme_set(theme_bw())

# Input and output locations, anchored at the renv project root
module_base <- rprojroot::find_root(rprojroot::is_renv_project)
data_dir <- file.path(module_base, "scratch", "benchmark-datasets")
result_dir <- file.path(module_base, "results", "benchmark-results")
plot_pca_calls <- function(df,
                           color_column,
                           pred_column,
                           title) {
  # Scatter plot of the first two PCs colored by a singlet/doublet column,
  # with the doublet points re-drawn in black on top so they are not hidden
  # underneath the (more numerous) singlets.
  # `df` must contain columns PC1, PC2, `color_column`, and `pred_column`;
  # the column arguments are passed unquoted (tidy evaluation), not as strings.
  doublet_points <- dplyr::filter(df, {{color_column}} == "doublet")

  ggplot(df, aes(x = PC1, y = PC2, color = {{color_column}})) +
    geom_point(size = 0.75, alpha = 0.6) +
    # doublet level sorts first, so it receives black; singlets get gray80
    scale_color_manual(name = "", values = c("black", "gray80")) +
    geom_point(data = doublet_points, color = "black", size = 0.75) +
    ggtitle(title)
}
plot_pca_metrics <- function(df, color_column, metric_colors) {
  # Scatter plot of the first two PCs colored by confusion-matrix call type,
  # re-drawing the false calls (fp/fn) on top so they stand out.
  # `metric_colors` is a named vector of colors for each of tp/tn/fp/fn.
  # `df` must contain columns PC1, PC2, and `color_column`; `color_column`
  # is passed unquoted (tidy evaluation), not as a string.
  false_calls <- dplyr::filter(df, {{color_column}} %in% c("fp", "fn"))

  ggplot(df, aes(x = PC1, y = PC2, color = {{color_column}})) +
    geom_point(size = 0.75, alpha = 0.6) +
    geom_point(data = false_calls, size = 0.75) +
    scale_color_manual(name = "Call type", values = metric_colors) +
    ggtitle("Consensus call metrics")
}
First, we’ll read in and combine doublet results into a list of data frames for each dataset. We’ll also create new columns for each dataset:
- consensus_call, which will be “doublet” if all
methods predict “doublet,” and “singlet” otherwise
- call_type, which will classify the consensus call as
one of “tp”, “tn”, “fp”, or “fn” (true/false positive/negative)# find all dataset names to process:
# Find all dataset names to process from the scrublet result files.
# Note: `list.files()` interprets `pattern` as a regular expression, not a
# glob — the original "*_scrublet.tsv" left the leading `*` dangling and let
# the unescaped `.` match any character. Escape the dot and anchor at the
# end of the filename so only true "<dataset>_scrublet.tsv" files match.
dataset_names <- list.files(result_dir, pattern = "_scrublet\\.tsv$") |>
  stringr::str_remove("_scrublet\\.tsv$")
# Colors for the confusion call types (tp/tn/fp/fn), used in PCA plots
confusion_colors <- c(
  "tp" = "lightblue",
  "tn" = "pink",
  "fp" = "blue",
  "fn" = "firebrick2"
)
# Read in data for analysis
doublet_df_list <- dataset_names |>
purrr::map(
\(dataset) {
scdbl_tsv <- file.path(result_dir, glue::glue("{dataset}_scdblfinder.tsv"))
scrub_tsv <- file.path(result_dir, glue::glue("{dataset}_scrublet.tsv"))
sce_file <- file.path(data_dir, dataset, glue::glue("{dataset}_sce.rds"))
scdbl_df <- scdbl_tsv |>
readr::read_tsv(show_col_types = FALSE) |>
dplyr::select(
barcodes,
cxds_score,
scdbl_score = score,
scdbl_prediction = class
) |>
# add cxds calls at `cxds_threshold` threshold
dplyr::mutate(
cxds_prediction = dplyr::if_else(
cxds_score >= cxds_threshold,
"doublet",
"singlet"
)
)
scrub_df <- readr::read_tsv(scrub_tsv, show_col_types = FALSE)
# grab ground truth and PCA coordinates
sce <- readr::read_rds(sce_file)
dataset_df <- scuttle::makePerCellDF(sce, use.dimred = "PCA") |>
tibble::rownames_to_column(var = "barcodes") |>
dplyr::select(barcodes,
ground_truth = ground_truth_doublets,
PC1 = PCA.1,
PC2 = PCA.2) |>
dplyr::left_join(
scrub_df,
by = "barcodes"
) |>
dplyr::left_join(
scdbl_df,
by = "barcodes"
)
# Add a consensus call column
dataset_df <- dataset_df |>
dplyr::rowwise() |>
dplyr::mutate(consensus_call = dplyr::if_else(
all(
c(scdbl_prediction, scrublet_prediction, cxds_prediction) == "doublet"
),
"doublet",
"singlet"
)) |>
dplyr::mutate(
call_type = dplyr::case_when(
consensus_call == "doublet" && ground_truth == "doublet" ~ "tp",
consensus_call == "singlet" && ground_truth == "singlet" ~ "tn",
consensus_call == "doublet" && ground_truth == "singlet" ~ "fp",
consensus_call == "singlet" && ground_truth == "doublet" ~ "fn"
)
)
return(dataset_df)
}
) |>
purrr::set_names(dataset_names)
This section contains upset plots that show overlap across doublet calls from each method, displayed for each dataset.
# Upset plots showing overlap in doublet calls across methods, one plot per
# dataset. Note that `purrr::iwalk()` is used for its side effects (printing
# the plots) and invisibly returns its input unchanged.
upset_list <- doublet_df_list |>
  purrr::iwalk(
    \(df, dataset) {
      # Names are the display labels; values are the prediction columns
      method_columns <- c(
        "scDblFinder" = "scdbl_prediction",
        "scrublet" = "scrublet_prediction",
        "cxds" = "cxds_prediction"
      )
      # Barcodes each method called as a doublet
      doublet_barcodes <- purrr::map(
        method_columns,
        \(column) df$barcodes[df[[column]] == "doublet"]
      )
      print(UpSetR::upset(fromList(doublet_barcodes), order.by = "freq"))
      # UpSetR plots have no title argument, so draw one with grid
      grid::grid.text(
        dataset,
        x = 0.65,
        y = 0.95,
        gp = grid::gpar(fontsize = 16)
      )
    }
  )
This section visualizes and evaluates the consensus doublet calls across each dataset.
This section plots the PCA for each dataset, clockwise from the top left:
- scDblFinder singlet/doublet calls
- scrublet singlet/doublet calls
- cxds singlet/doublet calls
- Ground truth singlet/doublet labels
- Consensus singlet/doublet calls
- Consensus call type: true positive (tp), true negative (tn), false positive (fp), false negative (fn)

For the first five panels, doublets are shown in black and singlets in light gray.
# For each dataset, plot the PCA colored six ways: each method's calls (top
# row), then ground truth, consensus call, and consensus call type (bottom
# row), assembled into a single patchwork figure with a shared legend.
doublet_df_list |>
  purrr::iwalk(
    \(df, dataset) {
      # Top row: individual method predictions
      scdbl_panel <- plot_pca_calls(
        df,
        color_column = scdbl_prediction,
        title = "scDblFinder prediction"
      )
      scrublet_panel <- plot_pca_calls(
        df,
        color_column = scrublet_prediction,
        title = "scrublet prediction"
      )
      cxds_panel <- plot_pca_calls(
        df,
        color_column = cxds_prediction,
        title = "cxds prediction"
      )

      # Bottom row: ground truth, consensus call, and call type
      truth_panel <- plot_pca_calls(
        df,
        color_column = ground_truth,
        title = "Ground truth"
      )
      consensus_panel <- plot_pca_calls(
        df,
        color_column = consensus_call,
        title = "Consensus call"
      )
      metrics_panel <- plot_pca_metrics(
        df,
        call_type,
        metric_colors = confusion_colors
      )

      # Assemble the 2x3 grid with a collected legend and dataset title
      combined_panels <- (scdbl_panel + scrublet_panel + cxds_panel) /
        (truth_panel + consensus_panel + metrics_panel) +
        plot_annotation(
          glue::glue("PCA for {dataset}"),
          theme = theme(plot.title = element_text(size = 16))
        ) +
        plot_layout(guides = "collect")
      plot(combined_panels)
    }
  )
This section calculates a confusion matrix and associated statistics on the consensus calls.
# Print a confusion matrix and associated statistics on the consensus calls
# for each dataset, and collect Kappa and balanced accuracy into a summary
# table for presentation below.
metric_df <- doublet_df_list |>
  purrr::imap(
    \(df, dataset) {
      print(glue::glue("======================== {dataset} ========================"))
      cat("Table of consensus calls:")
      print(table(df$consensus_call))
      cat("\n\n")

      # caret::confusionMatrix() assumes a supplied table has *predictions*
      # as rows and the reference (truth) as columns. Supplying truth as
      # rows (as this code previously did) silently swaps sensitivity with
      # PPV and specificity with NPV, and distorts balanced accuracy.
      confusion_result <- caret::confusionMatrix(
        table(
          "Consensus prediction" = df$consensus_call,
          "Truth" = df$ground_truth
        ),
        positive = "doublet"
      )
      print(confusion_result)

      # Extract information we want to present later in a table
      tibble::tibble(
        "Dataset name" = dataset,
        "Kappa" = round(confusion_result$overall["Kappa"], 3),
        "Balanced accuracy" = round(confusion_result$byClass["Balanced Accuracy"], 3)
      )
    }
  ) |>
  dplyr::bind_rows()
======================== hm-6k ========================
Table of consensus calls:
doublet singlet
138 6668
Confusion Matrix and Statistics
Consensus prediction
Truth doublet singlet
doublet 138 33
singlet 0 6635
Accuracy : 0.9952
95% CI : (0.9932, 0.9967)
No Information Rate : 0.9797
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.8908
Mcnemar's Test P-Value : 2.54e-08
Sensitivity : 1.00000
Specificity : 0.99505
Pos Pred Value : 0.80702
Neg Pred Value : 1.00000
Prevalence : 0.02028
Detection Rate : 0.02028
Detection Prevalence : 0.02512
Balanced Accuracy : 0.99753
'Positive' Class : doublet
======================== HMEC-orig-MULTI ========================
Table of consensus calls:
doublet singlet
641 25785
Confusion Matrix and Statistics
Consensus prediction
Truth doublet singlet
doublet 506 3062
singlet 135 22723
Accuracy : 0.879
95% CI : (0.875, 0.8829)
No Information Rate : 0.9757
P-Value [Acc > NIR] : 1
Kappa : 0.2079
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.78939
Specificity : 0.88125
Pos Pred Value : 0.14182
Neg Pred Value : 0.99409
Prevalence : 0.02426
Detection Rate : 0.01915
Detection Prevalence : 0.13502
Balanced Accuracy : 0.83532
'Positive' Class : doublet
======================== pbmc-1B-dm ========================
Table of consensus calls:
doublet singlet
38 3752
Confusion Matrix and Statistics
Consensus prediction
Truth doublet singlet
doublet 26 104
singlet 12 3648
Accuracy : 0.9694
95% CI : (0.9634, 0.9746)
No Information Rate : 0.99
P-Value [Acc > NIR] : 1
Kappa : 0.2986
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.68421
Specificity : 0.97228
Pos Pred Value : 0.20000
Neg Pred Value : 0.99672
Prevalence : 0.01003
Detection Rate : 0.00686
Detection Prevalence : 0.03430
Balanced Accuracy : 0.82825
'Positive' Class : doublet
======================== pdx-MULTI ========================
Table of consensus calls:
doublet singlet
4 10292
Confusion Matrix and Statistics
Consensus prediction
Truth doublet singlet
doublet 3 1314
singlet 1 8978
Accuracy : 0.8723
95% CI : (0.8657, 0.8787)
No Information Rate : 0.9996
P-Value [Acc > NIR] : 1
Kappa : 0.0038
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.7500000
Specificity : 0.8723280
Pos Pred Value : 0.0022779
Neg Pred Value : 0.9998886
Prevalence : 0.0003885
Detection Rate : 0.0002914
Detection Prevalence : 0.1279138
Balanced Accuracy : 0.8111640
'Positive' Class : doublet
Overall, methods do not have substantial overlap with each other. They each tend to detect different sets of doublets, leading to fairly small sets of consensus doublets. Further, the consensus doublets called by all three methods have some, but not substantial, overlap with the ground truth.
For three out of four datasets, scDblFinder predicts a
much larger number of doublets compared to other methods. For
pdx-MULTI, however, cxds predicts a much
larger number of doublets.
Based on the PCAs, it additionally looks like there are many more
false negatives in the consensus prediction that cxds
appears to capture but other methods do not.
The table below summarizes performance of the “consensus caller”.
Note that, in the benchmarking paper
these datasets were originally analyzed in, hm-6k was
observed to be one of the “easiest” datasets to classify across
methods.
Consistent with that observation, it has the highest kappa
value here, although it is still fairly low - though not as low as the
other datasets, whose kappa values are very close to 0.
# Display the summary table of consensus-caller performance metrics
metric_df
# record the versions of the packages used in this analysis and other environment information
sessionInfo()
R version 4.4.0 (2024-04-24)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.5
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/New_York
tzcode source: internal
attached base packages:
[1] stats4 stats graphics grDevices datasets utils methods
[8] base
other attached packages:
[1] UpSetR_1.4.0 caret_6.0-94
[3] lattice_0.22-6 patchwork_1.2.0
[5] ggplot2_3.5.1 SingleCellExperiment_1.26.0
[7] SummarizedExperiment_1.34.0 Biobase_2.64.0
[9] GenomicRanges_1.56.0 GenomeInfoDb_1.40.0
[11] IRanges_2.38.0 S4Vectors_0.42.0
[13] BiocGenerics_0.50.0 MatrixGenerics_1.16.0
[15] matrixStats_1.3.0
loaded via a namespace (and not attached):
[1] pROC_1.18.5 gridExtra_2.3
[3] rlang_1.1.3 magrittr_2.0.3
[5] e1071_1.7-14 compiler_4.4.0
[7] DelayedMatrixStats_1.26.0 vctrs_0.6.5
[9] reshape2_1.4.4 stringr_1.5.1
[11] pkgconfig_2.0.3 crayon_1.5.2
[13] XVector_0.44.0 labeling_0.4.3
[15] scuttle_1.14.0 utf8_1.2.4
[17] prodlim_2023.08.28 tzdb_0.4.0
[19] UCSC.utils_1.0.0 bit_4.0.5
[21] purrr_1.0.2 xfun_0.43
[23] beachmat_2.20.0 zlibbioc_1.50.0
[25] jsonlite_1.8.8 recipes_1.0.10
[27] DelayedArray_0.30.1 BiocParallel_1.38.0
[29] parallel_4.4.0 R6_2.5.1
[31] stringi_1.8.4 parallelly_1.37.1
[33] rpart_4.1.23 lubridate_1.9.3
[35] Rcpp_1.0.12 iterators_1.0.14
[37] knitr_1.46 future.apply_1.11.2
[39] readr_2.1.5 Matrix_1.7-0
[41] splines_4.4.0 nnet_7.3-19
[43] timechange_0.3.0 tidyselect_1.2.1
[45] rstudioapi_0.16.0 abind_1.4-5
[47] yaml_2.3.8 timeDate_4032.109
[49] codetools_0.2-20 listenv_0.9.1
[51] tibble_3.2.1 plyr_1.8.9
[53] withr_3.0.0 future_1.33.2
[55] survival_3.5-8 proxy_0.4-27
[57] pillar_1.9.0 BiocManager_1.30.23
[59] renv_1.0.7 foreach_1.5.2
[61] generics_0.1.3 vroom_1.6.5
[63] rprojroot_2.0.4 hms_1.1.3
[65] sparseMatrixStats_1.16.0 munsell_0.5.1
[67] scales_1.3.0 globals_0.16.3
[69] class_7.3-22 glue_1.7.0
[71] tools_4.4.0 data.table_1.15.4
[73] ModelMetrics_1.2.2.2 gower_1.0.1
[75] grid_4.4.0 ipred_0.9-14
[77] colorspace_2.1-0 nlme_3.1-164
[79] GenomeInfoDbData_1.2.12 cli_3.6.2
[81] fansi_1.0.6 S4Arrays_1.4.0
[83] lava_1.8.0 dplyr_1.1.4
[85] gtable_0.3.5 digest_0.6.35
[87] SparseArray_1.4.3 farver_2.1.1
[89] lifecycle_1.0.4 hardhat_1.3.1
[91] httr_1.4.7 bit64_4.0.5
[93] MASS_7.3-60.2